{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1fdde59d",
   "metadata": {},
   "source": [
    "Analysis of YouTube sets labeled just for Pranks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5e3fdf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "YT1 = pd.read_csv('YTlabeled_set1_490.csv')\n",
    "\n",
    "print(\"First few rows of YT1:\")\n",
    "print(YT1.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65d765c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "columns_of_interest = [\"isPrankManvik\", \"isPrankTess\", \"Manvikrevised\", \"Tessrevised\"]\n",
    "\n",
    "print(\"Unique entries in YT1:\")\n",
    "for column in columns_of_interest:\n",
    "    unique_values = YT1[column].unique()\n",
    "    print(f\"\\n{column}:\")\n",
    "    print(unique_values)\n",
    "\n",
    "print(\"\\nUnique entries in YT2:\")\n",
    "for column in columns_of_interest:\n",
    "    unique_values = YT2[column].unique()\n",
    "    print(f\"\\n{column}:\")\n",
    "    print(unique_values)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d5dd116",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "rows_with_x = YT1[YT1['Tessrevised'] == 'x']\n",
    "\n",
    "num_rows_removed = len(rows_with_x)\n",
    "\n",
    "YT1 = YT1[YT1['Tessrevised'] != 'x']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ec1d1ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def standardize_format(df, columns):\n",
    "    for column in columns:\n",
    "        df[column] = df[column].astype(str)  \n",
    "        df[column] = df[column].replace({'0': 0, '1': 1})  \n",
    "        df[column] = df[column].astype(int)  \n",
    "    return df\n",
    "\n",
    "YT1 = standardize_format(YT1, columns_of_interest)\n",
    "\n",
    "YT2 = standardize_format(YT2, columns_of_interest)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4aa7f66d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#summary stats\n",
    "tess_non_prank = YT1['isPrankTess'].value_counts()[0]\n",
    "tess_prank = YT1['isPrankTess'].value_counts()[1]\n",
    "\n",
    "manvik_non_prank = YT1['isPrankManvik'].value_counts()[0]\n",
    "manvik_prank = YT1['isPrankManvik'].value_counts()[1]\n",
    "\n",
    "print(f\"Tess labeled as non-prank (0): {tess_non_prank}\")\n",
    "print(f\"Tess labeled as prank (1): {tess_prank}\\n\")\n",
    "\n",
    "print(f\"Manvik labeled as non-prank (0): {manvik_non_prank}\")\n",
    "print(f\"Manvik labeled as prank (1): {manvik_prank}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6082a262",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import cohen_kappa_score\n",
    "\n",
    "agreement = YT1[YT1['isPrankTess'] == YT1['isPrankManvik']]\n",
    "percent_agreement = len(agreement) / len(YT1) * 100\n",
    "\n",
    "kappa = cohen_kappa_score(YT1['isPrankTess'], YT1['isPrankManvik'])\n",
    "\n",
    "print(f\"Percent agreement between Tess and Manvik: {percent_agreement:.2f}%\")\n",
    "print(f\"Cohen's kappa: {kappa:.2f}\")\n",
    "\n",
    "#repeat for set2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87764e0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge YT1 and YT2 \n",
    "YTmerged = pd.concat([YT1, YT2], ignore_index=True)\n",
    "\n",
    "print(\"First few rows of YTmerged:\")\n",
    "print(YTmerged.head())\n",
    "\n",
    "print(f\"\\nShape of YTmerged: {YTmerged.shape}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0cec66d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# year stats\n",
    "percent_1s_per_year = YTmerged.groupby('year')['Tessrevised'].mean() * 100\n",
    "\n",
    "print(\"Percentage of 1 labels in 'Tessrevised' per year:\")\n",
    "print(percent_1s_per_year)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "557ca777",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "labeled_1 = YTmerged[YTmerged['Tessrevised'] == 1]\n",
    "\n",
    "channels_by_year = labeled_1.groupby('year')['channel_title'].apply(list)\n",
    "\n",
    "for year, channels in channels_by_year.items():\n",
    "    print(f\"Year: {year}\")\n",
    "    print(\"Channel Titles:\")\n",
    "    for channel in channels:\n",
    "        print(f\"- {channel}\")\n",
    "    print(\"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57b8d2f8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
